The goal of this script is to generate a Seurat object for sample 2021_36.

  • removal of cells based on quality control metrics
  • normalization with LogNormalize, then doublets detection using scran hybrid and scDblFinder method, and doublet cells removal
  • normalization with LogNormalize, for only the remaining cells
  • cell cycle and cell type annotation
  • dimensionality reduction using PCA
  • projection using tSNE and UMAP
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
## [1] "/usr/local/lib/R/library"

Preparation

In this section, we set the global settings of the analysis. We will store data there :

out_dir = "."

We load the parameters :

sample_name = params$sample_name # "2021_31"
# sample_name = "2021_31"

Input count matrix is there :

count_matrix_dir = paste0(out_dir, "/input/", sample_name)
list.files(count_matrix_dir)
## [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz"

We load the markers and specific colors for each cell type :

cell_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
##          CD4 T cells          CD8 T cells     Langerhans cells 
##                   13                   13                    9 
##          macrophages              B cells              cuticle 
##                   10                   16                   15 
##               cortex              medulla                  IRS 
##                   16                   10                   16 
##        proliferative               HF-SCs            IFE basal 
##                   20                   17                   16 
## IFE granular spinous                  ORS          melanocytes 
##                   17                   15                   10 
##            sebocytes 
##                    8

Here are custom colors for each cell type :

color_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())

We load markers to display on the dotplot :

dotplot_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
## $`CD4 T cells`
## [1] "PTPRC" "CD3E"  "CD4"  
## 
## $`CD8 T cells`
## [1] "CD3E" "CD8A"
## 
## $`Langerhans cells`
## [1] "CD207" "CPVL" 
## 
## $macrophages
## [1] "TREM2" "MSR1" 
## 
## $`B cells`
## [1] "CD79A" "CD79B"
## 
## $cuticle
## [1] "MSX2"  "KRT32" "KRT35"
## 
## $cortex
## [1] "KRT31" "PRR9" 
## 
## $medulla
## [1] "BAMBI"   "ADLH1A3"
## 
## $IRS
## [1] "KRT71" "KRT73"
## 
## $proliferative
## [1] "TOP2A" "MCM5"  "TK1"  
## 
## $`HF-SCs`
## [1] "KRT14"  "CXCL14"
## 
## $`IFE basal`
## [1] "COL17A1" "KRT15"  
## 
## $`IFE granular spinous`
## [1] "SPINK5" "KRT1"  
## 
## $ORS
## [1] "KRT16" "KRT6C"
## 
## $melanocytes
## [1] "DCT"   "MLANA"
## 
## $sebocytes
## [1] "CLMP"  "PPARG"

We load metadata for this sample :

sample_info = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
##   project_name sample_type sample_identifier platform gender location
## 1      2021_36          HS              HS_2      10X      M axillary
##   laboratory   color
## 1        Our #E26E47

These is a parameter for different functions :

cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 6
cut_nFeature_RNA = 500
cut_percent.mt = 20
cut_percent.rb = 50

Load count matrix

In this section, we load the raw count matrix. Then, we applied an empty droplets filtering.

sobj = aquarius::load_sc_data(data_path = count_matrix_dir,
                              sample_name = sample_name,
                              my_seed = 1337L)
## [1]   27955 6794880
## [1] 13856080
## [1] 27955  2794
## [1] 12310357
## [1] 0.8884444
sobj
## An object of class Seurat 
## 27955 features across 2794 samples within 1 assay 
## Active assay: RNA (27955 features, 0 variable features)

(Time to run : 110.2 s)

In genes metadata, we add the Ensembl ID. The sobj@assays$RNA@meta.features dataframe contains three information :

  • rownames : gene names stored as the dimnames of the count matrix. Duplicated gene names will have a .1 at the end of their name
  • Ensembl_ID : EnsemblID, as stored in the features.tsv.gz file
  • gene_name : gene_name, as stored in the features.tsv.gz file. Duplicated gene names will have the same name.
features_df = read.csv(paste0(count_matrix_dir, "/features.tsv.gz"), sep = "\t", header = 0)
features_df = features_df[, c(1:2)]
colnames(features_df) = c("Ensembl_ID", "gene_name")
rownames(features_df) = rownames(sobj) # mandatory for Seurat::FindVariableFeatures

sobj@assays$RNA@meta.features = features_df
rm(features_df)

head(sobj@assays$RNA@meta.features)
##                  Ensembl_ID   gene_name
## MIR1302-2HG ENSG00000243485 MIR1302-2HG
## FAM138A     ENSG00000237613     FAM138A
## OR4F5       ENSG00000186092       OR4F5
## AL627309.1  ENSG00000238009  AL627309.1
## AL627309.3  ENSG00000239945  AL627309.3
## AL627309.4  ENSG00000241599  AL627309.4

We add the same columns as in metadata :

row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]
sobj$location = sample_info[row_oi, "location"]
sobj$laboratory = sample_info[row_oi, "laboratory"]

colnames(sobj@meta.data)
## [1] "orig.ident"        "nCount_RNA"        "nFeature_RNA"     
## [4] "log_nCount_RNA"    "project_name"      "sample_identifier"
## [7] "sample_type"       "location"          "laboratory"

Before filtering

Normalization

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 27955 features across 2794 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)

Projection

We generate a tSNE to visualize cells before filtering.

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj
## An object of class Seurat 
## 27955 features across 2794 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Cell type

We annotate cells for cell type using Seurat::AddModuleScore function.

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
## 
##          CD4 T cells          CD8 T cells     Langerhans cells 
##                  126                   88                   43 
##          macrophages              B cells              cuticle 
##                  197                   73                  336 
##               cortex              medulla                  IRS 
##                   56                  123                   70 
##        proliferative               HF-SCs            IFE basal 
##                  228                  364                  304 
## IFE granular spinous                  ORS          melanocytes 
##                   98                  304                  327 
##            sebocytes 
##                   57

(Time to run : 8.52 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle phase

We annotate cells for cell cycle phase using Seurat and cyclone.

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##   G1  G2M    S 
## 1532  586  672
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##      
##         G1  G2M    S
##   G1  1007  317  418
##   G2M  209  186   79
##   S    316   83  175

(Time to run : 157.46 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")

head(sobj@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA log_nCount_RNA
## AAACCCAAGTCGGCAA-1    2021_36      19505         4124       9.878426
## AAACCCACATGGAATA-1    2021_36        220          184       5.393628
## AAACGAAAGCGAGAAA-1    2021_36        283          187       5.645447
## AAACGAACAAGGTTGG-1    2021_36      21207         3906       9.962087
## AAACGCTAGCTCCGAC-1    2021_36        306          221       5.723585
## AAACGCTTCATGCGGC-1    2021_36        307          197       5.726848
##                    project_name sample_identifier sample_type location
## AAACCCAAGTCGGCAA-1      2021_36              HS_2          HS axillary
## AAACCCACATGGAATA-1      2021_36              HS_2          HS axillary
## AAACGAAAGCGAGAAA-1      2021_36              HS_2          HS axillary
## AAACGAACAAGGTTGG-1      2021_36              HS_2          HS axillary
## AAACGCTAGCTCCGAC-1      2021_36              HS_2          HS axillary
## AAACGCTTCATGCGGC-1      2021_36              HS_2          HS axillary
##                    laboratory score_CD4_T_cells score_CD8_T_cells
## AAACCCAAGTCGGCAA-1        Our       -0.26287451       -0.19630172
## AAACCCACATGGAATA-1        Our       -0.04474491       -0.03410599
## AAACGAAAGCGAGAAA-1        Our        0.51627483       -0.03247181
## AAACGAACAAGGTTGG-1        Our       -0.23057851       -0.15345435
## AAACGCTAGCTCCGAC-1        Our       -0.04978122       -0.03124868
## AAACGCTTCATGCGGC-1        Our       -0.03567270       -0.03460655
##                    score_Langerhans_cells score_macrophages score_B_cells
## AAACCCAAGTCGGCAA-1            -0.31844183       -0.18709622  -0.039606646
## AAACCCACATGGAATA-1            -0.08138652       -0.06339439  -0.007903516
## AAACGAAAGCGAGAAA-1            -0.02622376       -0.04417943  -0.004931778
## AAACGAACAAGGTTGG-1            -0.25573095       -0.24478321  -0.057289853
## AAACGCTAGCTCCGAC-1            -0.06720644       -0.04296673  -0.002413793
## AAACGCTTCATGCGGC-1            -0.06974101       -0.07703044  -0.002411620
##                    score_cuticle  score_cortex score_medulla   score_IRS
## AAACCCAAGTCGGCAA-1    -0.2917865  0.0001341305   -0.04557153 -0.16244649
## AAACCCACATGGAATA-1    -0.1065524 -0.0316357773   -0.05722499 -0.04031336
## AAACGAAAGCGAGAAA-1    -0.1147099 -0.0391599103   -0.01204805 -0.04525292
## AAACGAACAAGGTTGG-1    -0.2809439 -0.1117497497   -0.00522252 -0.03352736
## AAACGCTAGCTCCGAC-1    -0.1384133 -0.0559448145   -0.02218862 -0.03666464
## AAACGCTTCATGCGGC-1    -0.1311935 -0.0496628045   -0.03694775 -0.03446216
##                    score_proliferative score_HF-SCs score_IFE_basal
## AAACCCAAGTCGGCAA-1         -0.14284997   -0.1298370     -0.42863033
## AAACCCACATGGAATA-1         -0.03526958   -0.1434347     -0.16755889
## AAACGAAAGCGAGAAA-1         -0.01862761    0.1161159      0.16967451
## AAACGAACAAGGTTGG-1         -0.13465271    0.2743963      0.68422496
## AAACGCTAGCTCCGAC-1         -0.02507185    0.6548029      0.28280453
## AAACGCTTCATGCGGC-1         -0.02548878    0.2628369      0.08002734
##                    score_IFE_granular_spinous  score_ORS score_melanocytes
## AAACCCAAGTCGGCAA-1                -0.25799668 -0.0888419         1.2135326
## AAACCCACATGGAATA-1                -0.06299190 -0.1359450        -0.1921994
## AAACGAAAGCGAGAAA-1                -0.04094054 -0.1419941        -0.1791771
## AAACGAACAAGGTTGG-1                 0.31452143 -0.2308672        -0.5296198
## AAACGCTAGCTCCGAC-1                -0.06181424  0.1481141        -0.2176438
## AAACGCTTCATGCGGC-1                -0.04845482 -0.1624408         0.1897934
##                    score_sebocytes   cell_type Seurat.Phase cyclone.Phase
## AAACCCAAGTCGGCAA-1     -0.04854020 melanocytes           G1             S
## AAACCCACATGGAATA-1     -0.03375543     B cells           G1             S
## AAACGAAAGCGAGAAA-1     -0.01890765 CD4 T cells           G1           G2M
## AAACGAACAAGGTTGG-1      0.09242599   IFE basal           G1            G1
## AAACGCTAGCTCCGAC-1     -0.03976390      HF-SCs            S             S
## AAACGCTTCATGCGGC-1     -0.01765694      HF-SCs           G1           G2M
##                    percent.mt percent.rb
## AAACCCAAGTCGGCAA-1   4.932069   25.65496
## AAACCCACATGGAATA-1   4.545455   11.81818
## AAACGAAAGCGAGAAA-1   1.060071   31.09541
## AAACGAACAAGGTTGG-1  10.199462   27.45791
## AAACGCTAGCTCCGAC-1   4.575163   24.18301
## AAACGCTTCATGCGGC-1   1.954397   31.59609

We get the cell barcodes for the failing cells :

fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()

Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))
fsobj
## An object of class Seurat 
## 27955 features across 775 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 27955 features across 2794 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

We identify doublet cells :

fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)
## [1] 27955   775
## 
## FALSE  TRUE 
##   724    51 
## [18:50:51] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## 
## FALSE  TRUE 
##   701    74 
## 
## FALSE  TRUE 
##   666   109
fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)

(Time to run : 48.67 s)

We add the information in the non filtered Seurat object :

sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)

Quality control representation

We can visualize the 4 cells quality with a Venn diagram :

n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))

Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)

Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Number of features

To visualize the threshold for number of features, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)

Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)

Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)

Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the log_nCount_RNA by nFeature_RNA figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (doublets_consensus.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (scDblFinder.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (hybrid_score.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

Visualization as piechart

Do filtered cells belong to a particular cell type ?

sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")

Doublet cells status

We can compare doublet detection methods with a Venn diagram :

ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))

We visualize cells annotation for doublets :

plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)

Save

We could save this object before filtering (remove eval = FALSE) :

saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))

Filtering

We remove :

  • cells with a number of UMI lower than 6
  • cells expressing a number of genes lower than 500
  • cells having more than 20 % of UMI related to mitochondrial genes
  • cells having more than 50 % of UMI related to ribosomal genes
  • doublet cells detected with both method (scDblFinder and scds-hybrid)
sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb,
                               fail_doublets_consensus)))
sobj
## An object of class Seurat 
## 27955 features across 602 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Post-filtering processing

Normalization

We normalize the count matrix for remaining cells :

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 27955 features across 602 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Projection

We perform a PCA :

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE and a UMAP with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))

Annotation

We annotate cells for cell type, with the new normalized expression matrix :

score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
## 
##          CD4 T cells          CD8 T cells     Langerhans cells 
##                   63                   45                   16 
##          macrophages              B cells              cuticle 
##                   38                   21                   35 
##               cortex              medulla                  IRS 
##                    5                    9                   13 
##        proliferative               HF-SCs            IFE basal 
##                   56                   19                  121 
## IFE granular spinous                  ORS          melanocytes 
##                   22                   48                   76 
##            sebocytes 
##                   15

(Time to run : 5.19 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle

We annotate cells for cell cycle phase :

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##  G1 G2M   S 
## 413  50 139
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##      
##        G1 G2M   S
##   G1  337  22  96
##   G2M  26  27  11
##   S    50   1  32

(Time to run : 60.03 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Clustering

We make a highly resolutive clustering :

sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 602
## Number of edges: 14737
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7282
## Number of communities: 12
## Elapsed time: 0 seconds
table(sobj$seurat_clusters)
## 
##  0  1  2  3  4  5  6  7  8  9 10 11 
## 74 67 60 59 55 55 49 49 47 42 23 22

Visualization

Cell type

We can visualize the cell type :

tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Cell cycle

We can visualize the cell cycle, from Seurat :

tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

We can visualize the cell cycle, from cyclone :

tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Clusters

We visualize the clustering :

tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Gene expression

We visualize all cell types markers on the tSNE :

markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)

Save

We save the annotated and filtered Seurat object :

saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))

R session

show
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.5   patchwork_1.1.2 dplyr_1.0.7    
## 
## loaded via a namespace (and not attached):
##   [1] softImpute_1.4              graphlayouts_0.7.0         
##   [3] pbapply_1.4-2               lattice_0.20-41            
##   [5] haven_2.3.1                 vctrs_0.3.8                
##   [7] usethis_2.0.1               dynwrap_1.2.1              
##   [9] blob_1.2.1                  survival_3.2-13            
##  [11] prodlim_2019.11.13          dynutils_1.0.5             
##  [13] later_1.3.0                 DBI_1.1.1                  
##  [15] R.utils_2.11.0              SingleCellExperiment_1.8.0 
##  [17] rappdirs_0.3.3              uwot_0.1.8                 
##  [19] dqrng_0.2.1                 jpeg_0.1-8.1               
##  [21] zlibbioc_1.32.0             pspline_1.0-18             
##  [23] pcaMethods_1.78.0           mvtnorm_1.1-1              
##  [25] htmlwidgets_1.5.4           GlobalOptions_0.1.2        
##  [27] future_1.22.1               UpSetR_1.4.0               
##  [29] laeken_0.5.2                leiden_0.3.3               
##  [31] clustree_0.4.3              parallel_3.6.3             
##  [33] scater_1.14.6               irlba_2.3.3                
##  [35] DEoptimR_1.0-9              tidygraph_1.1.2            
##  [37] Rcpp_1.0.9                  readr_2.0.2                
##  [39] KernSmooth_2.23-17          carrier_0.1.0              
##  [41] promises_1.1.0              gdata_2.18.0               
##  [43] DelayedArray_0.12.3         limma_3.42.2               
##  [45] graph_1.64.0                RcppParallel_5.1.4         
##  [47] Hmisc_4.4-0                 fs_1.5.2                   
##  [49] RSpectra_0.16-0             fastmatch_1.1-0            
##  [51] ranger_0.12.1               digest_0.6.25              
##  [53] png_0.1-7                   sctransform_0.2.1          
##  [55] cowplot_1.0.0               DOSE_3.12.0                
##  [57] ggvenn_0.1.9                here_1.0.1                 
##  [59] TInGa_0.0.0.9000            ggraph_2.0.3               
##  [61] pkgconfig_2.0.3             GO.db_3.10.0               
##  [63] DelayedMatrixStats_1.8.0    gower_0.2.1                
##  [65] ggbeeswarm_0.6.0            iterators_1.0.12           
##  [67] DropletUtils_1.6.1          reticulate_1.26            
##  [69] clusterProfiler_3.14.3      SummarizedExperiment_1.16.1
##  [71] circlize_0.4.15             beeswarm_0.4.0             
##  [73] GetoptLong_1.0.5            xfun_0.35                  
##  [75] bslib_0.3.1                 zoo_1.8-10                 
##  [77] tidyselect_1.1.0            reshape2_1.4.4             
##  [79] purrr_0.3.4                 ica_1.0-2                  
##  [81] pcaPP_1.9-73                viridisLite_0.3.0          
##  [83] rtracklayer_1.46.0          rlang_1.0.2                
##  [85] hexbin_1.28.1               jquerylib_0.1.4            
##  [87] dyneval_0.9.9               glue_1.4.2                 
##  [89] RColorBrewer_1.1-2          matrixStats_0.56.0         
##  [91] stringr_1.4.0               lava_1.6.7                 
##  [93] europepmc_0.3               DESeq2_1.26.0              
##  [95] recipes_0.1.17              labeling_0.3               
##  [97] httpuv_1.5.2                class_7.3-17               
##  [99] BiocNeighbors_1.4.2         DO.db_2.9                  
## [101] annotate_1.64.0             jsonlite_1.7.2             
## [103] XVector_0.26.0              bit_4.0.4                  
## [105] mime_0.9                    aquarius_0.1.5             
## [107] Rsamtools_2.2.3             gridExtra_2.3              
## [109] gplots_3.0.3                stringi_1.4.6              
## [111] processx_3.5.2              gsl_2.1-6                  
## [113] bitops_1.0-6                cli_3.0.1                  
## [115] batchelor_1.2.4             RSQLite_2.2.0              
## [117] randomForest_4.6-14         tidyr_1.1.4                
## [119] data.table_1.14.2           rstudioapi_0.13            
## [121] org.Mm.eg.db_3.10.0         GenomicAlignments_1.22.1   
## [123] nlme_3.1-147                qvalue_2.18.0              
## [125] scran_1.14.6                locfit_1.5-9.4             
## [127] scDblFinder_1.1.8           listenv_0.8.0              
## [129] ggthemes_4.2.4              gridGraphics_0.5-0         
## [131] R.oo_1.24.0                 dbplyr_1.4.4               
## [133] BiocGenerics_0.32.0         TTR_0.24.2                 
## [135] readxl_1.3.1                lifecycle_1.0.1            
## [137] timeDate_3043.102           ggpattern_0.3.1            
## [139] munsell_0.5.0               cellranger_1.1.0           
## [141] R.methodsS3_1.8.1           proxyC_0.1.5               
## [143] visNetwork_2.0.9            caTools_1.18.0             
## [145] codetools_0.2-16            Biobase_2.46.0             
## [147] GenomeInfoDb_1.22.1         vipor_0.4.5                
## [149] lmtest_0.9-38               msigdbr_7.5.1              
## [151] htmlTable_1.13.3            triebeard_0.3.0            
## [153] lsei_1.2-0                  xtable_1.8-4               
## [155] ROCR_1.0-7                  BiocManager_1.30.10        
## [157] scatterplot3d_0.3-41        abind_1.4-5                
## [159] farver_2.0.3                parallelly_1.28.1          
## [161] RANN_2.6.1                  askpass_1.1                
## [163] GenomicRanges_1.38.0        RcppAnnoy_0.0.16           
## [165] tibble_3.1.5                ggdendro_0.1-20            
## [167] cluster_2.1.0               future.apply_1.5.0         
## [169] Seurat_3.1.5                dendextend_1.15.1          
## [171] Matrix_1.3-2                ellipsis_0.3.2             
## [173] prettyunits_1.1.1           lubridate_1.7.9            
## [175] ggridges_0.5.2              igraph_1.2.5               
## [177] RcppEigen_0.3.3.7.0         fgsea_1.12.0               
## [179] remotes_2.4.2               scBFA_1.0.0                
## [181] destiny_3.0.1               VIM_6.1.1                  
## [183] testthat_3.1.0              htmltools_0.5.2            
## [185] BiocFileCache_1.10.2        yaml_2.2.1                 
## [187] utf8_1.1.4                  plotly_4.9.2.1             
## [189] XML_3.99-0.3                ModelMetrics_1.2.2.2       
## [191] e1071_1.7-3                 foreign_0.8-76             
## [193] withr_2.5.0                 fitdistrplus_1.0-14        
## [195] BiocParallel_1.20.1         xgboost_1.4.1.1            
## [197] bit64_4.0.5                 foreach_1.5.0              
## [199] robustbase_0.93-9           Biostrings_2.54.0          
## [201] GOSemSim_2.13.1             rsvd_1.0.3                 
## [203] memoise_2.0.0               evaluate_0.18              
## [205] forcats_0.5.0               rio_0.5.16                 
## [207] geneplotter_1.64.0          tzdb_0.1.2                 
## [209] caret_6.0-86                ps_1.6.0                   
## [211] DiagrammeR_1.0.6.1          curl_4.3                   
## [213] fdrtool_1.2.15              fansi_0.4.1                
## [215] highr_0.8                   urltools_1.7.3             
## [217] xts_0.12.1                  GSEABase_1.48.0            
## [219] acepack_1.4.1               edgeR_3.28.1               
## [221] checkmate_2.0.0             scds_1.2.0                 
## [223] cachem_1.0.6                npsurv_0.4-0               
## [225] babelgene_22.3              rjson_0.2.20               
## [227] openxlsx_4.1.5              ggrepel_0.9.1              
## [229] clue_0.3-60                 rprojroot_2.0.2            
## [231] stabledist_0.7-1            tools_3.6.3                
## [233] sass_0.4.0                  nichenetr_1.1.1            
## [235] magrittr_2.0.1              RCurl_1.98-1.2             
## [237] proxy_0.4-24                car_3.0-11                 
## [239] ape_5.3                     ggplotify_0.0.5            
## [241] xml2_1.3.2                  httr_1.4.2                 
## [243] assertthat_0.2.1            rmarkdown_2.18             
## [245] boot_1.3-25                 globals_0.14.0             
## [247] R6_2.4.1                    Rhdf5lib_1.8.0             
## [249] nnet_7.3-14                 RcppHNSW_0.2.0             
## [251] progress_1.2.2              genefilter_1.68.0          
## [253] statmod_1.4.34              gtools_3.8.2               
## [255] shape_1.4.6                 HDF5Array_1.14.4           
## [257] BiocSingular_1.2.2          rhdf5_2.30.1               
## [259] splines_3.6.3               AUCell_1.8.0               
## [261] carData_3.0-4               colorspace_1.4-1           
## [263] generics_0.1.0              stats4_3.6.3               
## [265] base64enc_0.1-3             dynfeature_1.0.0           
## [267] smoother_1.1                gridtext_0.1.1             
## [269] pillar_1.6.3                tweenr_1.0.1               
## [271] sp_1.4-1                    ggplot.multistats_1.0.0    
## [273] rvcheck_0.1.8               GenomeInfoDbData_1.2.2     
## [275] plyr_1.8.6                  gtable_0.3.0               
## [277] zip_2.2.0                   knitr_1.41                 
## [279] ComplexHeatmap_2.14.0       latticeExtra_0.6-29        
## [281] biomaRt_2.42.1              IRanges_2.20.2             
## [283] fastmap_1.1.0               ADGofTest_0.3              
## [285] copula_1.0-0                doParallel_1.0.15          
## [287] AnnotationDbi_1.48.0        vcd_1.4-8                  
## [289] babelwhale_1.0.1            openssl_1.4.1              
## [291] scales_1.1.1                backports_1.2.1            
## [293] S4Vectors_0.24.4            ipred_0.9-12               
## [295] enrichplot_1.6.1            hms_1.1.1                  
## [297] ggforce_0.3.1               Rtsne_0.15                 
## [299] shiny_1.7.1                 numDeriv_2016.8-1.1        
## [301] polyclip_1.10-0             grid_3.6.3                 
## [303] lazyeval_0.2.2              Formula_1.2-3              
## [305] tsne_0.1-3                  crayon_1.3.4               
## [307] MASS_7.3-54                 pROC_1.16.2                
## [309] viridis_0.5.1               dynparam_1.0.0             
## [311] rpart_4.1-15                zinbwave_1.8.0             
## [313] compiler_3.6.3              ggtext_0.1.0
---
params:
  sample_name: "2021_31"
title: "HS project"
subtitle: "Sample `r params$sample_name`"
author: "Audrey"
date: "`r format(Sys.time(), '%Y-%m-%d')`"
output:
  html_document:
    code_folding: show
    code_download: true
    toc: true
    toc_float: true
    number_sections: false
---

<style>
body {
text-align: justify}
</style>

<!-- Automatically computes and prints in the output the running time for any code chunk -->
```{r, echo=FALSE}
# https://github.com/rstudio/rmarkdown/issues/1453
hooks = knitr::knit_hooks$get()
hook_foldable = function(type) {
  force(type)
  function(x, options) {
    res = hooks[[type]](x, options)
    
    if (isFALSE(options[[paste0("fold_", type)]])) return(res)
    
    paste0(
      "<details><summary>", "show", "</summary>\n\n",
      res,
      "\n\n</details>"
    )
  }
}
knitr::knit_hooks$set(
  output = hook_foldable("output"),
  plot = hook_foldable("plot"),
  time_it = local({
    now = NULL
    function(before, options) {
      if (options$time_it) {
        if (before) {
          now <<- Sys.time()
        } else {
          res = difftime(Sys.time(), now, units = "secs")
          paste("(Time to run :", round(res, digits = 2), "s)")
        }
      }
    }
  })
)
```

<!-- Set default parameters for all chunks -->
```{r, setup, include = FALSE}
set.seed(1337L)
knitr::opts_chunk$set(echo = TRUE, # display code
                      # display chunk output
                      message = FALSE,
                      warning = FALSE,
                      fold_output = FALSE, # usefull for sessionInfo()
                      fold_plot = FALSE,
                      
                      # figure settings
                      fig.align = 'center',
                      fig.width = 20,
                      fig.height = 15,
                      
                      # something about seed, chunk and Rmarkdown compilation
                      # https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script
                      # cache = TRUE,
                      cache.lazy = FALSE, 
                      
                      # add runtime after chunk
                      time_it = FALSE)
```


The goal of this script is to generate a Seurat object for sample `r params$sample_name`.

* removal of cells based on quality control metrics
* normalization with `LogNormalize`, then doublets detection using `scran hybrid` and `scDblFinder` method, and doublet cells removal
* normalization with `LogNormalize`, for only the remaining cells
* cell cycle and cell type annotation
* dimensionality reduction using `PCA`
* projection using `tSNE` and `UMAP`


```{r library}
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
```

# Preparation

In this section, we set the global settings of the analysis. We will store data there :

```{r out_dir}
out_dir = "."
```

We load the parameters :

```{r get_param}
sample_name = params$sample_name # "2021_31"
# sample_name = "2021_31"
```

Input count matrix is there :

```{r count_matrix_dir}
count_matrix_dir = paste0(out_dir, "/input/", sample_name)
list.files(count_matrix_dir)
```


We load the markers and specific colors for each cell type :

```{r cell_markers}
cell_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
```

Here are custom colors for each cell type :

```{r color_markers, fig.width = 12, fig.height = 1, class.source = "fold-hide"}
color_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())
```


We load markers to display on the dotplot :

```{r dotplot_markers}
dotplot_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
```


We load metadata for this sample :

```{r sample_info}
sample_info = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
```


These is a parameter for different functions :

```{r global_settings}
cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 6
cut_nFeature_RNA = 500
cut_percent.mt = 20
cut_percent.rb = 50
```

# Load count matrix

In this section, we load the raw count matrix. Then, we applied an empty droplets filtering.

```{r load_count_matrix, time_it = TRUE}
sobj = aquarius::load_sc_data(data_path = count_matrix_dir,
                              sample_name = sample_name,
                              my_seed = 1337L)
sobj
```

In genes metadata, we add the Ensembl ID. The `sobj@assays$RNA@meta.features` dataframe contains three information :

* `rownames` : gene names stored as the dimnames of the count matrix. Duplicated gene names will have a `.1` at the end of their name
* `Ensembl_ID` : EnsemblID, as stored in the `features.tsv.gz` file
* `gene_name` : gene_name, as stored in the `features.tsv.gz` file. Duplicated gene names will have the same name.

```{r features_df}
features_df = read.csv(paste0(count_matrix_dir, "/features.tsv.gz"), sep = "\t", header = 0)
features_df = features_df[, c(1:2)]
colnames(features_df) = c("Ensembl_ID", "gene_name")
rownames(features_df) = rownames(sobj) # mandatory for Seurat::FindVariableFeatures

sobj@assays$RNA@meta.features = features_df
rm(features_df)

head(sobj@assays$RNA@meta.features)
```

We add the same columns as in metadata :

```{r add_metadata}
row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]
sobj$location = sample_info[row_oi, "location"]
sobj$laboratory = sample_info[row_oi, "laboratory"]

colnames(sobj@meta.data)
```

# Before filtering

## Normalization

```{r normalization}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We generate a tSNE to visualize cells before filtering.

```{r pca_before, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE with 20 principal components :

```{r tsne_before}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj
```

## Cell type

We annotate cells for cell type using `Seurat::AddModuleScore` function.

```{r cell_annot_custom_short, time_it = TRUE}
sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
```

To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short, fig.width = 12, fig.height = 9}
markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```

## Cell cycle phase

We annotate cells for cell cycle phase using `Seurat` and `cyclone`.

```{r cell_cycle, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```

We visualize cell cycle on the projection :

```{r see_cell_cycle1, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```

# Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

```{r qc_metrics}
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")

head(sobj@meta.data)
```

We get the cell barcodes for the failing cells :

```{r failed}
fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()
```


## Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

```{r fsobj}
fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))
fsobj
```

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

```{r normalization_doublets}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

We identify doublet cells :

```{r doublet_detection, time_it = TRUE}
fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)

fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)
```

We add the information in the non filtered Seurat object :

```{r add_doublet_metrics}
sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)
```

```{r clean_fsobj, echo = FALSE}
rm(fsobj)
```


## Quality control representation

We can visualize the 4 cells quality with a Venn diagram : 

```{r qc_venn, class.source = "fold-hide", fig.width = 8, fig.height = 6}
n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))
```


### Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

```{r qc_umi_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)
```

```{r vln_umi_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_umi_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Number of features

To visualize the threshold for number of features, we can make a histogram :

```{r qc_features_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)
```


```{r vln_features_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_features_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

```{r qc_mito_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)
```

```{r vln_percentmt_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_mito_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

```{r qc_ribo_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)
```


```{r vln_percentrb_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_ribo_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the `log_nCount_RNA` by `nFeature_RNA` figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

```{r qc_patchwork_cell_type, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

```{r qc_patchwork_mito, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

```{r qc_patchwork_ribo, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`doublets_consensus.class`) :

```{r qc_patchwork_doublet_consensus, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`scDblFinder.class`) :

```{r qc_patchwork_scdblfinder, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`hybrid_score.class`) :

```{r qc_patchwork_hybrid_score, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```


### Visualization as piechart

Do filtered cells belong to a particular cell type ?

```{r qc_piechart_cell_type, class.source = "fold-hide", fig.width = 12, fig.height = 9}
sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
```

```{r clean_qc_4, echo = FALSE}
sobj$all_cells = NULL

rm(plot_list, df)
```


### Doublet cells status

We can compare doublet detection methods with a Venn diagram : 

```{r qc_venn_doublet, class.source = "fold-hide", fig.width = 8, fig.height = 6}
ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))
```

We visualize cells annotation for doublets :

```{r doublets_proj, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)
```

## Save

We could save this object before filtering (remove `eval = FALSE`) :

```{r save_sobj_unfiltered_annotated, eval = FALSE}
saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))
```


# Filtering

We remove :

* cells with a number of UMI lower than `r cut_log_nCount_RNA`
* cells expressing a number of genes lower than `r cut_nFeature_RNA`
* cells having more than `r cut_percent.mt` \% of UMI related to mitochondrial genes
* cells having more than `r cut_percent.rb` \% of UMI related to ribosomal genes
* doublet cells detected with both method (**scDblFinder** and **scds-hybrid**)


```{r filter_cells}
sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb,
                               fail_doublets_consensus)))
sobj
```

```{r clean_filter, echo = FALSE}
rm(fail_doublets_consensus, fail_doublets_scDblFinder, fail_doublets_hybrid,
   fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA,
   cut_percent.mt, cut_percent.rb, cut_log_nCount_RNA, cut_nFeature_RNA)
```


# Post-filtering processing

## Normalization

We normalize the count matrix for remaining cells :

```{r normalization_3}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We perform a PCA :

```{r pca, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE and a UMAP with 20 principal components :

```{r tsne_umap}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))
```


## Annotation

We annotate cells for cell type, with the new normalized expression matrix :

```{r cell_type_2, time_it = TRUE}
score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
```


To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short2, fig.width = 12, fig.height = 9}
markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype2, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```


### Cell cycle

We annotate cells for cell cycle phase :

```{r cell_cycle2, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```


We visualize cell cycle on the projection :

```{r see_cell_cycle2, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```


## Clustering

We make a highly resolutive clustering :

```{r clustering}
sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)

table(sobj$seurat_clusters)
```


# Visualization

## Cell type

We can visualize the cell type :

```{r see_cell_type, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```

## Cell cycle

We can visualize the cell cycle, from Seurat :

```{r see_cc_Seurat, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


We can visualize the cell cycle, from cyclone :

```{r see_cc_cyclone, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Clusters

We visualize the clustering :

```{r see_clusters, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Gene expression

We visualize all cell types markers on the tSNE :

```{r plot_genes, fig.width = 12, fig.height = 20}
markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)
```


# Save

We save the annotated and filtered Seurat object :

```{r save_sobj_filtered_annotated}
saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))
```


# R session

```{r sessioninfo, echo = FALSE, fold_output = TRUE}
sessionInfo()
```
